@@ -14,6 +14,46 @@ module WebRequestConcern |
||
14 | 14 |
end |
15 | 15 |
end |
16 | 16 |
|
17 |
# Faraday response middleware that normalizes a response body to UTF-8,
# optionally gunzipping it first.  The body string is modified in place.
#
# Options:
#   force_encoding:   source encoding to assume unconditionally; when set it
#                     takes precedence over any charset detection.
#   default_encoding: source encoding to assume for textual content types
#                     (text/*, and application/ xml or json, including
#                     "+xml"/"+json" suffixed subtypes) that carry no charset
#                     parameter.
#   unzip:            'gzip' to decompress the body before transcoding.
class CharacterEncoding < Faraday::Middleware
  # Matches the charset parameter of a Content-Type header value.
  CHARSET_PARAM = /;\s*charset\s*=\s*([^()<>@,;:\\\"\/\[\]?={}\s]+)/i

  # Matches Content-Type values whose media type is textual.
  TEXTUAL_TYPE = /\A\s*(?:text\/[^\s;]+|application\/(?:[^\s;]+\+)?(?:xml|json))\s*(?:;|\z)/i

  def initialize(app, force_encoding: nil, default_encoding: nil, unzip: nil)
    super(app)
    @force_encoding = force_encoding
    @default_encoding = default_encoding
    @unzip = unzip
  end

  # Runs the rest of the stack and post-processes the response body once the
  # response is complete.  Returns whatever `on_complete` returns.
  def call(env)
    @app.call(env).on_complete do |response_env|
      body = response_env[:body]
      # Nothing to decode when there is no string body.
      next unless body.is_a?(String)

      if @unzip == 'gzip'
        body.replace(ActiveSupport::Gzip.decompress(body))
        # Re-tag the decompressed bytes as binary so that the charset
        # detection below runs on them no matter how the decompressor
        # tagged its result.
        body.force_encoding(Encoding::ASCII_8BIT)
      end

      source_encoding =
        if @force_encoding
          @force_encoding
        elsif body.encoding == Encoding::ASCII_8BIT
          # Not all Faraday adapters support automatic charset
          # detection, so we do that.
          detect_encoding(response_env[:response_headers][:content_type])
        else
          # Already tagged with an encoding; transcode from that.
          body.encoding
        end

      # Never try to transcode binary content, or text whose encoding could
      # not be determined.  `next` (not `return`) is required here: `return`
      # inside a block returns from the enclosing method, or raises
      # LocalJumpError if that method has already returned.
      next if source_encoding.nil?

      body.encode!(Encoding::UTF_8, source_encoding)
    end
  end

  private

  # Determines the source encoding from a Content-Type header value.
  # Returns the encoding named by the charset parameter, the configured
  # default for textual types without a charset, or nil when the charset
  # name is unknown or the content is not textual.
  def detect_encoding(content_type)
    case content_type
    when CHARSET_PARAM
      begin
        Encoding.find(Regexp.last_match(1))
      rescue ArgumentError
        nil
      end
    when TEXTUAL_TYPE
      @default_encoding
    end
  end
end

Faraday::Response.register_middleware character_encoding: CharacterEncoding
|
56 |
+ |
|
17 | 57 |
extend ActiveSupport::Concern |
18 | 58 |
|
19 | 59 |
def validate_web_request_options! |
@@ -34,6 +74,23 @@ module WebRequestConcern |
||
34 | 74 |
rescue ArgumentError => e |
35 | 75 |
errors.add(:base, e.message) |
36 | 76 |
end |
77 |
+ |
|
78 |
+ if (encoding = options['force_encoding']).present? |
|
79 |
+ case encoding |
|
80 |
+ when String |
|
81 |
+ begin |
|
82 |
+ Encoding.find(encoding) |
|
83 |
+ rescue ArgumentError |
|
84 |
+ errors.add(:base, "Unknown encoding: #{encoding.inspect}") |
|
85 |
+ end |
|
86 |
+ else |
|
87 |
+ errors.add(:base, "force_encoding must be a string") |
|
88 |
+ end |
|
89 |
+ end |
|
90 |
+ end |
|
91 |
+ |
|
92 |
# Encoding handed to the character_encoding middleware as its
# `default_encoding`, i.e. the encoding assumed for textual responses
# whose Content-Type carries no charset parameter.
def default_encoding
  Encoding::UTF_8
end
38 | 95 |
|
39 | 96 |
def faraday |
@@ -44,6 +101,11 @@ module WebRequestConcern |
||
44 | 101 |
} |
45 | 102 |
|
46 | 103 |
@faraday ||= Faraday.new(faraday_options) { |builder| |
104 |
+ builder.response :character_encoding, |
|
105 |
+ force_encoding: interpolated['force_encoding'].presence, |
|
106 |
+ default_encoding: default_encoding, |
|
107 |
+ unzip: interpolated['unzip'].presence |
|
108 |
+ |
|
47 | 109 |
builder.headers = headers if headers.length > 0 |
48 | 110 |
|
49 | 111 |
builder.headers[:user_agent] = user_agent |
@@ -51,7 +113,7 @@ module WebRequestConcern |
||
51 | 113 |
builder.use FaradayMiddleware::FollowRedirects |
52 | 114 |
builder.request :url_encoded |
53 | 115 |
|
54 |
- if boolify(options['disable_url_encoding']) |
|
116 |
+ if boolify(interpolated['disable_url_encoding']) |
|
55 | 117 |
builder.options.params_encoder = DoNotEncoder |
56 | 118 |
end |
57 | 119 |
|
@@ -29,6 +29,7 @@ module Agents |
||
29 | 29 |
* `basic_auth` - Specify HTTP basic auth parameters: `"username:password"`, or `["username", "password"]`. |
30 | 30 |
* `disable_ssl_verification` - Set to `true` to disable ssl verification. |
31 | 31 |
* `disable_url_encoding` - Set to `true` to disable url encoding. |
32 |
+ * `force_encoding` - Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header. Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1). |
|
32 | 33 |
* `user_agent` - A custom User-Agent name (default: "Faraday v#{Faraday::VERSION}"). |
33 | 34 |
* `max_events_per_run` - Limit number of events created (items parsed) per run for feed. |
34 | 35 |
|
@@ -87,7 +87,7 @@ module Agents |
||
87 | 87 |
|
88 | 88 |
Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance). This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results. |
89 | 89 |
|
90 |
- Set `force_encoding` to an encoding name if the website does not return a Content-Type header with a proper charset. |
|
90 |
+ Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header. Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1). |
|
91 | 91 |
|
92 | 92 |
Set `user_agent` to a custom User-Agent name if the website does not like the default value (`#{default_user_agent}`). |
93 | 93 |
|
@@ -157,19 +157,6 @@ module Agents |
||
157 | 157 |
errors.add(:base, "Invalid uniqueness_look_back format") unless is_positive_integer?(options['uniqueness_look_back']) |
158 | 158 |
end |
159 | 159 |
|
160 |
- if (encoding = options['force_encoding']).present? |
|
161 |
- case encoding |
|
162 |
- when String |
|
163 |
- begin |
|
164 |
- Encoding.find(encoding) |
|
165 |
- rescue ArgumentError |
|
166 |
- errors.add(:base, "Unknown encoding: #{encoding.inspect}") |
|
167 |
- end |
|
168 |
- else |
|
169 |
- errors.add(:base, "force_encoding must be a string") |
|
170 |
- end |
|
171 |
- end |
|
172 |
- |
|
173 | 160 |
validate_web_request_options! |
174 | 161 |
end |
175 | 162 |
|
@@ -284,12 +271,6 @@ module Agents |
||
284 | 271 |
interpolation_context.stack { |
285 | 272 |
interpolation_context['_response_'] = ResponseDrop.new(response) |
286 | 273 |
body = response.body |
287 |
- if (encoding = interpolated['force_encoding']).present? |
|
288 |
- body = body.encode(Encoding::UTF_8, encoding) |
|
289 |
- end |
|
290 |
- if interpolated['unzip'] == "gzip" |
|
291 |
- body = ActiveSupport::Gzip.decompress(body) |
|
292 |
- end |
|
293 | 274 |
doc = parse(body) |
294 | 275 |
|
295 | 276 |
if extract_full_json? |